In [2]:
import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from tester import test_classifier
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV
In [3]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary','deferral_payments', 'total_payments', 'loan_advances',
'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person',
'from_messages', 'from_this_person_to_poi',
'shared_receipt_with_poi'] # You will need to use more features
In [4]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
data_dict = pickle.load(data_file)
In [5]:
# Let's look at some dataset statistics
print "# of records: ", len(data_dict)
In [6]:
# POIs vs non-POIs
poi_count = 0
for p in data_dict.values():
    if p['poi']:
        poi_count += 1
print "# of POIs: ", poi_count
print "# of non-POIs: ", len(data_dict) - poi_count
In [7]:
# Missing values in features
print "# of missing values in features: "
NaNInFeatures = [0 for i in range(len(features_list))]
for i, person in enumerate(data_dict.values()):
for j, feature in enumerate(features_list):
if person[feature] == 'NaN':
NaNInFeatures[j] += 1
for i, feature in enumerate(features_list):
print feature, NaNInFeatures[i]
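As an illustrative follow-up (not part of the original notebook), we can flag the features that are missing for more than half of the records; the 50% cutoff is an arbitrary choice for this sketch.
# Illustrative sketch: flag features missing in more than half of the records
# (the 50% threshold is an assumption, not a project requirement).
threshold = len(data_dict) / 2
for feature, count in zip(features_list, NaNInFeatures):
    if count > threshold:
        print "mostly missing:", feature, "(", count, "of", len(data_dict), ")"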
In [9]:
### Task 2: Remove outliers
# Let's look at the names.
s = []
for person in data_dict.keys():
    s.append(person)
    if len(s) == 4:
        print '{:<30}{:<30}{:<30}{:<30}'.format(s[0], s[1], s[2], s[3])
        s = []
if s:  # print any names left over after the last full group of four
    print ''.join('{:<30}'.format(name) for name in s)
We see above that there is an entry called "TOTAL". That obviously cannot be a person's name, so we will need to remove it from the dataset. Before we do, let's confirm it is what the name suggests.
In [10]:
print "print out some values of the observation 'TOTAL'"
for name, person in data_dict.iteritems():
if name == 'TOTAL':
print person
In [11]:
# The loop below still includes the 'TOTAL' record, so every salary is counted
# twice; dividing the sum by 2 recovers the total over the individual persons.
salary = []
for name, person in data_dict.iteritems():
    if float(person['salary']) > 0:
        salary.append(float(person['salary']))
print "the sum of salary of all other persons is: ", np.sum(salary) / 2
We see that this sum matches the salary recorded against the "TOTAL" entry, confirming that it is an aggregate row rather than a person.
In [12]:
# Let's remove this TOTAL record.
data_dict.pop('TOTAL')
In [13]:
# There is a also a record which belongs to "THE TRAVEL AGENCY IN THE PARK".
# This is not a person and hence should be removed.
data_dict.pop("THE TRAVEL AGENCY IN THE PARK")
In [15]:
# Number of records after removing TOTAL & THE TRAVEL AGENCY IN THE PARK
print "# of records after removals: ", len(data_dict)
In [16]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
print "we create two new features here 'to_poi_message_ratio' and 'from_poi_message_ratio' "
for person in my_dataset.values():
person['to_poi_message_ratio'] = 0
person['from_poi_message_ratio'] = 0
if float(person['from_messages']) > 0:
person['to_poi_message_ratio'] = float(person['from_this_person_to_poi'])/float(person['from_messages'])
if float(person['to_messages']) > 0:
person['from_poi_message_ratio'] = float(person['from_poi_to_this_person'])/float(person['to_messages'])
features_list.extend(['to_poi_message_ratio', 'from_poi_message_ratio'])
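As a quick sanity check (added here for illustration, not in the original notebook), we can count how many records end up with a strictly positive value for each of the two new ratio features:
# Illustrative check of the engineered features: count the records where each
# ratio is strictly positive (records lacking email statistics do not qualify).
for f in ['to_poi_message_ratio', 'from_poi_message_ratio']:
    nonzero = sum(1 for p in my_dataset.values() if p[f] > 0)
    print "%s > 0 for %d of %d records" % (f, nonzero, len(my_dataset))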
In [17]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
In [31]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()  # provided baseline classifier; replaced by the tuned decision tree below
clf = DecisionTreeClassifier(min_samples_split=6, random_state=10)
test_classifier(clf, my_dataset, features_list)
#clf = ensemble.RandomForestClassifier(criterion='gini', n_estimators=14, max_depth=7,
# max_features=None, random_state=42, min_samples_split=1)
#clf = AdaBoostClassifier(algorithm='SAMME')
#params = dict(reduce_dim__n_components=[1, 2, 3], tree__min_samples_split=[2, 4, 6, 8, 10])
#clf = GridSearchCV(clf, param_grid=params, n_jobs=-1, scoring='recall')
#test_classifier(clf, my_dataset, features_list)
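The commented-out grid above refers to pipeline step names ('reduce_dim', 'tree') that are never defined in this cell. Below is a minimal sketch of how such a search could be wired up, assuming a PCA + decision-tree pipeline and reusing the StratifiedShuffleSplit import from above; the step names, parameter values, and scoring choice are illustrative rather than the tuned final model.
# Illustrative sketch only: PCA + decision tree tuned with GridSearchCV.
# Step names match the commented-out parameter grid above; StratifiedShuffleSplit
# is used as the cv strategy because the dataset is small and heavily imbalanced.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pipe = Pipeline([('reduce_dim', PCA()),
                 ('tree', DecisionTreeClassifier(random_state=10))])
params = dict(reduce_dim__n_components=[1, 2, 3],
              tree__min_samples_split=[2, 4, 6, 8, 10])
cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.3, random_state=42)
search = GridSearchCV(pipe, param_grid=params, scoring='recall', cv=cv)
search.fit(features, labels)
print "best parameters found:", search.best_params_
#test_classifier(search.best_estimator_, my_dataset, features_list)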
In [67]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
train_test_split(features, labels, test_size=0.3, random_state=42)
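With this split in hand, a quick single-split check can serve as a sanity test; this snippet is an illustrative addition, and the precision/recall figures reported for the project come from test_classifier's stratified shuffle split, not from this one split.
# Illustrative single-split sanity check of the chosen classifier.
from sklearn.metrics import precision_score, recall_score
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print "holdout precision:", precision_score(labels_test, pred)
print "holdout recall:   ", recall_score(labels_test, pred)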
In [ ]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)